
## =====================================================================================================================
## Genderizes inventor, examiner, and attorney first names using Genderize.io
## Note that one needs an API key.
## Zihao Li. 06/2024
## =====================================================================================================================

import sys, os
import csv
import string
import pandas as pd
from genderize import Genderize

# Root folder of the PatentsView data. Input and output paths in this script
# are built by prefixing this string, so it must end with a path separator.
data_dir = r'/Volumes/Zihao_SSD2/PatentsView/'
# Switch the working directory to the data folder. This runs as soon as the
# module is loaded and raises if the folder (an external volume) is missing.
os.chdir(data_dir)

def assign_gender(firstname):
    """Look up a single first name on Genderize.io.

    Input: firstname, the first name to query (a string).
    Output: a dictionary with the keys name, count, gender and probability,
    copied from the first record of the API response.
    """
    client = Genderize(api_key='your-api-key', timeout=60)
    # The API takes a list of names; only one name is sent, so only the
    # first record of the answer is used.
    records = client.get([firstname])
    first_record = records[0]

    wanted_keys = ('name', 'count', 'gender', 'probability')
    return {key: first_record[key] for key in wanted_keys}


# Keys returned by assign_gender(), in the order they are stored in the CSV files.
_GENDER_COLUMNS = ['name', 'count', 'gender', 'probability']

# Groups of people handled by this script; each one has its own set of temp files.
_ENTITIES = ('inventor', 'examiner', 'attorney')


def _temp_path(entity, suffix):
    """Return the path of the file temp/g_<entity>_<suffix>.csv under data_dir."""
    return data_dir + 'temp/g_{}_{}.csv'.format(entity, suffix)


def _fill_gender(df, queries, round_number):
    """Query each name in `queries` and store the answer in the matching row of `df`.

    df:           data frame with the columns name, count, gender, probability,
                  round and first_name_clean; it is modified in place.
    queries:      pandas Series whose index holds row labels of `df` and whose
                  values are the names to send to assign_gender().
    round_number: value written to the 'round' column of rows that were answered.

    A failed lookup does not stop the loop: the row gets round = -1 and the
    error is printed, so one bad request cannot discard the rows already done.
    """
    for i, query in queries.items():
        try:
            gendered_row = assign_gender(query)
        except Exception as err:  # API or network failure for this one name
            df.loc[i, 'round'] = -1
            print(i, '***ERROR***', df.loc[i, 'first_name_clean'], err)
            continue

        # A plain list in column order is assigned instead of the dict itself,
        # so the result does not depend on how pandas interprets a dict on the
        # right-hand side of a .loc assignment.
        df.loc[i, _GENDER_COLUMNS] = [gendered_row[col] for col in _GENDER_COLUMNS]
        df.loc[i, 'round'] = round_number
        print(i, gendered_row)


def _run_round1(entity, start=0):
    """Round 1: query the first names exactly as they come from the cleaning step.

    Reads temp/g_<entity>_clean_firstname.csv and appends one row per answered
    name to temp/g_<entity>_firstname_gendered_r1.csv. `start` is the position
    of the first row to query, which allows an interrupted run to be resumed.
    """
    df = pd.read_csv(_temp_path(entity, 'clean_firstname'), encoding='utf-8')
    first_names = df['first_name_clean']

    # The file is opened in append mode so earlier results are kept; the
    # context manager makes sure it is flushed and closed even after an error.
    out_path = _temp_path(entity, 'firstname_gendered_r1')
    with open(out_path, 'a+', newline='', encoding='utf-8') as out_file:
        w = csv.DictWriter(out_file, fieldnames=_GENDER_COLUMNS)
        # Only an empty file needs a header; a resumed run must not add a second one.
        if out_file.tell() == 0:
            w.writeheader()

        for i in range(start, len(first_names)):
            firstname = first_names.iloc[i]
            try:
                gendered_row = assign_gender(firstname)
            except Exception as err:  # skip this name, keep going
                print(i, '***ERROR***', firstname, err)
                continue
            w.writerow(gendered_row)  # write the dictionary as a row of the CSV
            print(i, gendered_row)


def _run_round2(entity):
    """Round 2: drop special characters (excluding '-') and retry unresolved names.

    Merges the round-1 answers onto the clean first names, strips every
    character that is not an ASCII letter, a digit or '-' from the names whose
    gender is still missing, queries those names again and writes
    temp/g_<entity>_firstname_gendered_r2.csv.
    """
    df = pd.read_csv(_temp_path(entity, 'clean_firstname'), encoding='utf-8')
    df_gendered = pd.read_csv(_temp_path(entity, 'firstname_gendered_r1'), encoding='utf-8')
    df = df.merge(df_gendered, how='left', left_on='first_name_clean', right_on='name')
    del df_gendered
    df['name_cleaned'] = df['first_name_clean']
    df['round'] = 1
    df = df[['first_name_clean', 'name_cleaned', 'name', 'round', 'count', 'gender', 'probability']]

    # Collect every character that occurs in the data but is not allowed,
    # then remove all of them in a single pass.
    allowed = set(string.ascii_letters + string.digits + '-')
    seen = set(df['first_name_clean'].str.cat(sep=''))
    strip_table = str.maketrans('', '', ''.join(seen - allowed))

    missing = df['gender'].isna()
    df.loc[missing, 'name_cleaned'] = df.loc[missing, 'name_cleaned'].str.translate(strip_table)

    _fill_gender(df, df.loc[missing, 'name_cleaned'], 2)

    print('Exporting dataset...')
    df.to_csv(_temp_path(entity, 'firstname_gendered_r2'), index=False)


def _run_round3(entity):
    """Round 3: retry the still-unresolved hyphenated names with the '-' removed.

    Reads temp/g_<entity>_firstname_gendered_r2.csv, selects the rows whose
    cleaned name contains '-', whose gender is missing and whose round is not
    -1, queries the name without hyphens and writes
    temp/g_<entity>_firstname_gendered_r3.csv.
    """
    df = pd.read_csv(_temp_path(entity, 'firstname_gendered_r2'), encoding='utf-8')

    names = df['name_cleaned']
    # na=False treats missing names as "no hyphen", which gives a clean boolean mask.
    todo = (
        names.str.contains('-', regex=False, na=False)
        & df['gender'].isna()
        & (df['round'] != -1)
    )
    queries = names[todo].str.replace('-', '', regex=False)

    _fill_gender(df, queries, 3)

    print('Exporting dataset...')
    df.to_csv(_temp_path(entity, 'firstname_gendered_r3'), index=False)


def main(entity='attorney', round_number=3, start=0):
    """
    Run one round of genderization for one group of people.

    entity:       'inventor', 'examiner' or 'attorney'.
    round_number: 1, 2 or 3. Round 2 reads the output of round 1 and round 3
                  reads the output of round 2, so the rounds have to be run
                  in that order for a given entity.
    start:        first row to query in round 1 (ignored by rounds 2 and 3).

    Pass the round and entity as arguments instead of commenting code in and
    out. Called without arguments, this runs round 3 for attorneys.

    Raises ValueError for an unknown entity or round number.
    """
    if entity not in _ENTITIES:
        raise ValueError('entity must be one of {}, got {!r}'.format(_ENTITIES, entity))

    if round_number == 1:
        _run_round1(entity, start)
    elif round_number == 2:
        _run_round2(entity)
    elif round_number == 3:
        _run_round3(entity)
    else:
        raise ValueError('round_number must be 1, 2 or 3, got {!r}'.format(round_number))


# Start the genderization only when this file is executed as a script,
# not when it is imported from another module.
if __name__ == "__main__":
    main()







